Generating the CellDataSet object
# QC of data after
pData(dat)$Total_mRNAs <- Matrix::colSums(exprs(dat))
qplot(Total_mRNAs, data = pData(dat), fill = level1class, geom = "density",
alpha = 0.1) + facet_wrap("level1class", scales = "free") + guides(fill = "none") +
theme_bw()
# Calculate the mean copies per cell among all classes (Bulk) and draw the
# density plot for lincRNAs vs protein coding
dat.means <- detectGenes(dat, min_expr = 1)
dat.means <- dat.means[fData(dat.means)$num_cells_expressed >= 1]
fData(dat.means)$mean_cpc <- apply(exprs(dat.means), 1, mean)
fData(dat.means)$sd <- apply(exprs(dat.means), 1, sd)
fData(dat.means)$BCV <- (fData(dat.means)$sd/fData(dat.means)$mean_cpc)^2
tmp <- data.frame(gene_short_name = fData(dat.means)$gene_short_name, gene_type = fData(dat.means)$transcript_type,
mean_cpc = fData(dat.means)$mean_cpc, BCV = fData(dat.means)$BCV, num_cells_expresed = fData(dat.means)$num_cells_expressed)
dat_means <- subset(tmp, gene_type %in% c("protein_coding", "lincRNA"))
density.plot <- ggplot(dat_means) + geom_density(aes(x = log10(mean_cpc), color = gene_type)) +
scale_color_manual(values = c("red", "black")) + theme_bw()
density.plot
# List the lincRNAs that are expressed with a mean_cpc greater than 1
dat_lincRNA_sort <- subset(dat_means, gene_type %in% "lincRNA")
dat_mRNA_sort <- subset(dat_means, gene_type %in% "protein_coding")
# length(dat_lincRNA_sort$gene_short_name)
print("Number of lncRNAs = 441")
## [1] "Number of lncRNAs = 441"
# length(dat_mRNA_sort$gene_short_name)
print("Number of mRNAs = 17091")
## [1] "Number of mRNAs = 17091"
# Generate a boxplot on the batch data
dat_means$frac_cells_expressed <- dat_means$num_cells_expresed/length(rownames(pData(dat.means)))
boxplot <- ggplot(dat_means, aes(gene_type, frac_cells_expressed, fill = gene_type)) +
geom_boxplot(notch = T) + theme_bw() + scale_fill_manual(values = c("red",
"grey50"))
boxplot
# Generate empirical cumulative density function
ecdf <- ggplot(dat_means, aes(x = log10(BCV), color = gene_type)) + stat_ecdf(geom = "line") +
theme_bw() + scale_color_manual(values = c("red", "grey50"))
ecdf
# tSNE on all genes
dat.tSNE <- Rtsne(t(round(vstExprs(dat))), theta = 0.1)
pData(dat)$tSNE1_pos <- dat.tSNE$Y[, 1]
pData(dat)$tSNE2_pos <- dat.tSNE$Y[, 2]
p <- ggplot(pData(dat))
tSNE_cluster <- p + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) +
theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")
tSNE_cluster
tSNE_level2class <- p + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = level2class)) +
theme_bw() + coord_equal(1) + scale_color_manual(values = colorRampPalette(brewer.pal(9,
"Set1"))(length(unique(pData(tmp)$level2class))))
# tSNE on lincRNAS
tmp <- subset(dat, fData(dat)$transcript_type %in% "lincRNA")
lincRNA.dat.tSNE <- Rtsne(t(round(vstExprs(tmp))), theta = 0.1, check_duplicates = F)
pData(tmp)$tSNE1_pos <- lincRNA.dat.tSNE$Y[, 1]
pData(tmp)$tSNE2_pos <- lincRNA.dat.tSNE$Y[, 2]
q <- ggplot(pData(tmp))
lincRNA_tSNE <- q + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) +
theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")
lincRNA_tSNE
# Color by level2class
lincRNA_tSNE_level2class <- q + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos,
color = level2class)) + theme_bw() + coord_equal(1) + scale_color_manual(values = colorRampPalette(brewer.pal(9,
"Set1"))(length(unique(pData(tmp)$level2class))))
lincRNA_tSNE_level2class
# Color by Sex, size is reflective of XIST expression
lincRNA_tSNE_sex <- q + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = sex,
size = exprs(tmp)[1, ])) + theme_bw() + coord_equal(1) + scale_fill_brewer(palette = "Set1")
lincRNA_tSNE_sex
# Color by infered batch
pData(tmp)$cell_id <- colnames(exprs(tmp))
pData(tmp)$batch <- str_split_fixed(pData(tmp)$cell_id, "_", 2)[, 1]
j <- ggplot(pData(tmp))
lincRNA_tSNE_batch <- j + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = batch)) +
theme_bw() + coord_equal(1) + scale_color_manual(values = colorRampPalette(brewer.pal(9,
"Set1"))(length(unique(pData(tmp)$batch))))
lincRNA_tSNE_batch
# tSNE by lincRNA without Xist
sex_blind.dat <- subset(dat, fData(dat)$transcript_type %in% "lincRNA")
sex_blind_list <- as.vector(rownames(exprs(sex_blind.dat)[-1, ]))
sex_blind.dat <- subset(sex_blind.dat, rownames(exprs(sex_blind.dat)) %in% sex_blind_list)
sex_blind_lincRNA.tSNE <- Rtsne(t(round(vstExprs(sex_blind.dat))), theta = 0.1,
check_duplicates = F)
pData(sex_blind.dat)$tSNE1_pos <- sex_blind_lincRNA.tSNE$Y[, 1]
pData(sex_blind.dat)$tSNE2_pos <- sex_blind_lincRNA.tSNE$Y[, 2]
n <- ggplot(pData(sex_blind.dat))
sex_blind_tSNE <- n + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) +
theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")
sex_blind_tSNE
Seperate the “dat” CellDataSet by “Cluster” and calculate mean expression of genes
# Seperate the 'dat' CellDataSet by 'Cluster' and calculate mean expression
# of genes
Cluster.split <- lapply(unique(pData(dat)$group_num), function(x) {
dat[, pData(dat)$group_num == x]
})
Cluster.split <- lapply(c(1:length(Cluster.split)), function(x) {
detectGenes(Cluster.split[[x]], min_expr = 1)
})
Cluster.split <- lapply(c(1:length(Cluster.split)), function(i) {
x <- Cluster.split[[i]]
x[fData(x)$num_cells_expressed > 2]
})
Cluster.split <- lapply(Cluster.split, function(x) {
mean_cpc <- apply(exprs(x), 1, mean)
fData(x)$mean_cpc <- mean_cpc
return(x)
})
Cluster.split <- lapply(c(1:length(Cluster.split)), function(i) {
x <- Cluster.split[[i]]
frac_cells_expressed <- (fData(x)$num_cells_expressed/length(rownames(pData(x))))
fData(x)$frac_cells_expressed <- frac_cells_expressed
return(x)
})
tmp <- data.frame()
group_means <- lapply(c(1:length(Cluster.split)), function(i) {
x <- Cluster.split[[i]]
res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$transcript_type,
mean_cpc = fData(x)$mean_cpc, group_num = unique(pData(x)$group_num),
frac_cells_expressed = fData(x)$frac_cells_expressed)
tmp <- rbind(tmp, res)
})
tmp <- plyr::ldply(group_means, data.frame)
group_means <- subset(tmp, gene_type %in% c("protein_coding", "lincRNA"))
density.plot_Cluster <- ggplot(group_means) + geom_density(aes(x = log10(mean_cpc),
color = gene_type)) + facet_grid(. ~ group_num, labeller = labeller(group_num = function(x) {
paste("Cluster", x, sep = ":")
})) + scale_color_manual(values = c("red", "black")) + theme_bw()
density.plot_Cluster
# Generate box plots
boxplot_cluster <- ggplot(group_means, aes(gene_type, frac_cells_expressed,
fill = gene_type))
boxplot_cluster + geom_boxplot() + theme_bw() + scale_fill_manual(values = c("red",
"grey50")) + facet_grid(~group_num, labeller = labeller(group_num = function(x) {
paste("Cluster", x, sep = ":")
}))
Seperate the “dat” CellDataSet by level2class and calculate mean expression of genes
# Seperate the 'dat' CellDataSet by level2class and calculate mean
# expression of genes
level2.split <- lapply(unique(pData(dat)[pData(dat)$level1class == "interneurons",
]$level2class), function(x) {
dat[, pData(dat)$level2class == x]
})
level2.split <- lapply(c(1:length(level2.split)), function(x) {
detectGenes(level2.split[[x]], min_expr = 1)
})
level1.split <- lapply(c(1:length(level2.split)), function(i) {
x <- level2.split[[i]]
x[fData(x)$num_cells_expressed >= 1]
})
level2.split <- lapply(level2.split, function(x) {
mean_cpc <- apply(exprs(x), 1, mean)
fData(x)$mean_cpc <- mean_cpc
return(x)
})
tmp <- data.frame()
group_means_level2class <- lapply(c(1:length(level2.split)), function(i) {
x <- level2.split[[i]]
res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$transcript_type,
mean_cpc = fData(x)$mean_cpc, level2class = unique(pData(x)$level2class))
tmp <- rbind(tmp, res)
})
tmp <- plyr::ldply(group_means_level2class, data.frame)
group_means_level2class <- subset(tmp, gene_type %in% c("protein_coding", "lincRNA"))
density.plot_level2class <- ggplot(group_means_level2class) + geom_density(aes(x = log10(mean_cpc),
color = gene_type)) + facet_wrap(~level2class, nrow = 2) + scale_color_manual(values = c("red",
"black")) + theme_bw()
density.plot_level2class
## Warning: Removed 107012 rows containing non-finite values (stat_density).
level_2_barplot <- lapply(c(1:9), function(x) {
pData(Cluster.split[[x]])$level2class %>% unique() %>% length()
})
level_2_barplot <- data.frame(level_2_barplot)
colnames(level_2_barplot) <- c("1", "2", "3", "4", "5", "6", "7", "8", "9")
rownames(level_2_barplot) <- "Number_of_Clusters"
level_2_barplot <- as.data.frame(level_2_barplot)
level_2_barplot <- as.data.frame(t(level_2_barplot))
bar.plot <- ggplot(level_2_barplot)
bar.plot + geom_bar(aes(x = rownames(level_2_barplot), y = level_2_barplot$Number_of_Clusters,
fill = rownames(level_2_barplot)), colour = "black", width = 0.5, stat = "identity") +
scale_fill_brewer(palette = "Set1", guide = guide_legend(title = "Cluster #")) +
xlab("Cluster #") + ylab("Number of cellular subtypes") + theme_bw() + theme(axis.text.x = element_blank())
complete.level2.split <- lapply(unique(pData(dat)$level2class), function(x) {
dat[, pData(dat)$level2class == x]
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(x) {
detectGenes(complete.level2.split[[x]], min_expr = 1)
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(i) {
x <- complete.level2.split[[i]]
x[fData(x)$num_cells_expressed >= 1]
})
complete.level2.split <- lapply(complete.level2.split, function(x) {
mean_cpc <- apply(exprs(x), 1, mean)
fData(x)$mean_cpc <- mean_cpc
return(x)
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(i) {
x = complete.level2.split[[i]]
fData(x)$gene_std_pc <- rowSds(exprs(x))
return(x)
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(i) {
x = complete.level2.split[[i]]
fData(x)$BCV <- as.vector(fData(x)$gene_std_pc)/as.vector(fData(x)$mean_cpc)
return(x)
})
complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))),
function(i) {
x = complete.level2.split[[i]]
fData(x)$frac_cells_expressed <- fData(x)$num_cells_expressed/length(rownames(pData(x)))
return(x)
})
tmp <- data.frame()
bcv_level2class <- lapply(c(1:length(complete.level2.split)), function(i) {
x <- complete.level2.split[[i]]
res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$transcript_type,
mean_cpc = fData(x)$mean_cpc, BCV = fData(x)$BCV, gene_sd_pc = fData(x)$gene_std_pc,
level2class = unique(pData(x)$level2class), frac_cells_expressed = fData(x)$frac_cells_expressed)
tmp <- rbind(tmp, res)
})
names(bcv_level2class) <- unique(pData(dat)$level2class)
level2_bcv <- Reduce(function(...) merge(..., all = TRUE), bcv_level2class)
level2_bcv <- subset(level2_bcv, gene_type %in% c("lincRNA", "protein_coding"))
# Generate density plots
density.plot_all_level2class <- ggplot(level2_bcv) + geom_density(aes(x = log10(mean_cpc),
color = gene_type)) + facet_wrap(~level2class, nrow = 8) + scale_color_manual(values = c("red",
"black")) + theme_bw()
density.plot_all_level2class
# Generate box plots
boxplot_level2Class <- ggplot(level2_bcv, aes(gene_type, frac_cells_expressed,
fill = gene_type))
boxplot_level2Class + geom_boxplot() + theme_bw() + scale_fill_manual(values = c("red",
"grey50")) + facet_wrap(~level2class, nrow = 8)
cluster.bcv.dat <- lapply(unique(pData(dat)$group_num), function(x) {
dat[, pData(dat)$group_num == x]
})
cluster.bcv.dat <- lapply(c(1:length(cluster.bcv.dat)), function(x) {
detectGenes(cluster.bcv.dat[[x]], min_expr = 1)
})
cluster.bcv.dat <- lapply(c(1:length(cluster.bcv.dat)), function(i) {
x <- cluster.bcv.dat[[i]]
x[fData(x)$num_cells_expressed > 2]
})
cluster.bcv.dat <- lapply(cluster.bcv.dat, function(x) {
mean_cpc <- apply(exprs(x), 1, mean)
fData(x)$mean_cpc <- mean_cpc
return(x)
})
cluster.bcv.dat <- lapply(c(1:length(cluster.bcv.dat)), function(i) {
x = cluster.bcv.dat[[i]]
fData(x)$gene_std_pc <- rowSds(exprs(x))
return(x)
})
cluster.bcv.dat <- lapply(c(1:length(unique(cluster.bcv.dat))), function(i) {
x = cluster.bcv.dat[[i]]
fData(x)$BCV <- as.vector(fData(x)$gene_std_pc)/as.vector(fData(x)$mean_cpc)
return(x)
})
tmp <- data.frame()
bcv_cluster <- lapply(c(1:length(cluster.bcv.dat)), function(i) {
x <- cluster.bcv.dat[[i]]
res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$transcript_type,
mean_cpc = fData(x)$mean_cpc, BCV = fData(x)$BCV, gene_sd_pc = fData(x)$gene_std_pc,
cluster_num = rep(i))
tmp <- rbind(tmp, res)
})
bcv <- Reduce(function(...) merge(..., all = TRUE), bcv_cluster)
linc_bcv <- subset(bcv, bcv$gene_type == "lincRNA")
mRNA_bcv <- subset(bcv, bcv$gene_type == "protein_coding")
delta_median_by_cluster <- lapply(c(1:9), function(x) {
median(subset(mRNA_bcv, mRNA_bcv$cluster_num == x)$mean_cpc) - median(subset(linc_bcv,
linc_bcv$cluster_num == x)$mean_cpc)
})
names(delta_median_by_cluster) <- c("Cluster_1", "Cluster_2", "Cluster_3", "Cluster_4",
"Cluster_5", "Cluster_6", "Cluster_7", "Cluster_8", "Cluster_9")
delta_median_by_cluster <- as.matrix(delta_median_by_cluster)
delta_median_by_cluster <- as.data.frame(delta_median_by_cluster)
colnames(delta_median_by_cluster) <- "delta_median_mean_cpc"
delta_median_by_cluster$delta_median_mean_cpc <- as.numeric(delta_median_by_cluster$delta_median_mean_cpc)
delta_median_by_cluster$Number_of_subtypes <- as.vector(level_2_barplot$Number_of_Clusters)
p <- ggplot(delta_median_by_cluster, aes(x = Number_of_subtypes, y = delta_median_mean_cpc))
scatter.plot <- p + geom_point(aes(color = rownames(delta_median_by_cluster))) +
geom_smooth(aes(alpha = 0.1), method = "lm", se = T, color = "black", alpha = 0.2) +
scale_color_brewer(palette = "Set1", guide = guide_legend(title = "Cluster #")) +
guides(fill = "none") + theme_bw() + xlab("Number of cellular subtypes") +
ylab("Δ median copies per cell (mRNA-lncRNA)") + theme(legend.position = "none")
scatter.plot